Show the code
rm(list=ls()) Create a class to represent rational numbers. Do this using S4.
A constructor
A validator that ensures the denominator is non-zero.
A show method.
A simplify method, to obtain the simplest form (e.g. simplify(2/4) produces 1/2).
A quotient method (e.g. quotient(3/7) produces .42857143…). It should support a digits argument but only in the printing, not the returned result (Hint: what does print return?).
Addition, subtraction, multiplication, division. These should all return a rational.
You’ll (probably) need GCD and LCM as part of some of these calculations; include these functions using Rcpp. Even if you don’t need these functions for another calculation, include them.
rm(list=ls()) Defining GCD and LCM functions:
# Load Rcpp for GCD and LCM functions
library(Rcpp)
# Define GCD and LCM functions in a single C++ code block
cppFunction("
#include <cmath> // Include cmath for abs function
#include <numeric> // Include numeric for std::gcd and std::lcm functions
int C_gcd(int x, int y) {
return std::gcd(x, y);
}")
cppFunction("
int C_lcm(int x, int y) {
return (abs(x * y) / std::gcd(x, y));
}
")Setting up rational class:
# Set up rational class
# Define the S4 class
setClass("Rational",
slots = c(numerator = "numeric", denominator = "numeric"),
prototype = list(numerator = 0, denominator = 1)
)
# Validity
setValidity("Rational", function(object) {
if (object@denominator == 0) {
return("Denominator cannot be zero.")
}
TRUE
})Class "Rational" [in ".GlobalEnv"]
Slots:
Name: numerator denominator
Class: numeric numeric
# Simplify method-- this is just for internal use within constructor to obtain the simplest form
simplify_fraction <- function(numerator, denominator) {
divisor <- C_gcd(numerator, denominator)
numerator <- numerator / divisor
denominator <- denominator / divisor
list(numerator = numerator, denominator = denominator)
}
# Constructor
Rational <- function(numerator, denominator = 1) {
if (denominator == 0) {
stop("Error: Denominator cannot be zero.")
}
# Simplify the fraction upon creation
simplified <- simplify_fraction(numerator, denominator)
obj <- new("Rational", numerator = simplified$numerator, denominator = simplified$denominator)
obj # Return the valid object
}
# Show method
setMethod("show", "Rational", function(object) {
cat(object@numerator, "/", object@denominator, "\n")
})
# Simplify method for external use
setGeneric("simplify", function(object) standardGeneric("simplify"))[1] "simplify"
setMethod("simplify", "Rational", function(object) {
simplified <- simplify_fraction(object@numerator, object@denominator)
new("Rational", numerator = simplified$numerator, denominator = simplified$denominator)
})
# Quotient
setGeneric("quotient", function(object, digits = NULL) standardGeneric("quotient"))[1] "quotient"
setMethod("quotient", "Rational", function(object, digits = NULL) {
result <- object@numerator / object@denominator
if (!is.null(digits) && is.numeric(digits)) {
print(round(result, digits))
} else {
print(result)
}
invisible(result)
})
# Addition
setMethod("+", signature(e1 = "Rational", e2 = "Rational"), function(e1, e2) {
lcm_den <- C_lcm(e1@denominator, e2@denominator)
numerator <- e1@numerator * (lcm_den / e1@denominator) + e2@numerator * (lcm_den / e2@denominator)
Rational(numerator, lcm_den) # Automatically simplifies
})
# Subtraction
setMethod("-", signature(e1 = "Rational", e2 = "Rational"), function(e1, e2) {
lcm_den <- C_lcm(e1@denominator, e2@denominator)
numerator <- e1@numerator * (lcm_den / e1@denominator) - e2@numerator * (lcm_den / e2@denominator)
Rational(numerator, lcm_den) # Automatically simplifies
})
# Multiplication
setMethod("*", signature(e1 = "Rational", e2 = "Rational"), function(e1, e2) {
Rational(e1@numerator * e2@numerator, e1@denominator * e2@denominator) # Automatically simplifies
})
# Division
setMethod("/", signature(e1 = "Rational", e2 = "Rational"), function(e1, e2) {
if (e2@numerator == 0) stop("Cannot divide by a rational number with zero numerator.")
Rational(e1@numerator * e2@denominator, e1@denominator * e2@numerator) # Automatically simplifies
})r1: 24/6
r2: 7/230
r3: 0/4
# Creating objects
r1 <- Rational(24, 6) # This should simplify to 4/1
r2 <- Rational(7, 230) # This will stay as 7/230
r3 <- Rational(0, 4) # This should simplify to 0/1
r1 # 4/14 / 1
r2 # 7/2307 / 230
r3 #0/10 / 1
Evaluate the following code (remember you can tell Quarto not to stop on errors):
r14 / 1
r30 / 1
r1 + r2927 / 230
r1 - r2913 / 230
r1 * r214 / 115
r1 / r2920 / 7
r1 + r34 / 1
r1 * r30 / 1
r2 / r3Error in r2/r3: Cannot divide by a rational number with zero numerator.
quotient(r1)[1] 4
quotient(r2)[1] 0.03043478
quotient(r2, digits = 3)[1] 0.03
quotient(r2, digits = 3.14)[1] 0.03
quotient(r2, digits = "avocado")[1] 0.03043478
q2 <- quotient(r2, digits = 3)[1] 0.03
q2[1] 0.03043478
quotient(r3)[1] 0
simplify(r1)4 / 1
simplify(r2)7 / 230
simplify(r3)0 / 1
Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.
# Testing the validator for Rational objects with invalid inputs -- should all error
Rational(1, 0)Error in Rational(1, 0): Error: Denominator cannot be zero.
Rational("a", 2)Error in eval(expr, envir, enclos): Not compatible with requested type: [type=character; target=integer].
Rational(1, "b")Error in eval(expr, envir, enclos): Not compatible with requested type: [type=character; target=integer].
Rational("a", "b")Error in eval(expr, envir, enclos): Not compatible with requested type: [type=character; target=integer].
Note that there are a lot of choices to be made here. How are you going to store the class? Two numerics? A vector of length two? A formula? A string? What are users going to pass into the constructor? A string (“24/6”)? Two arguments? A vector?
There is no right answer to those questions. Make the best decision you can, and don’t be afraid to change it if your decision causes unforeseen difficulties.
You may not use any existing R functions or packages that would trivialize this assignment. (E.g. if you found an existing package that does this, or found a function that automatically produces the quotient or simplified version, that is not able to be used.)
Hint: It may be useful to define other functions that I don’t explicitly ask for.
Let’s revisit the art data from the last problem set. Use plotly for these.
library(plotly)Loading required package: ggplot2
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
# Load art sales df
art <- read.csv("df_for_ml_improved_new_market.csv")You may copy your plot from last time, or copy my plot from the solutions, or come up with your own new plot.
Overall, as seen in the stacked bar graph, more art of all genres seems to be getting sold as the years progress. However, from 1997-1999, it seemed like the genre of sales were all close to each other–one genre did not sell more than the others since not a lot of art was getting sold in general. From 1999-2012, we see a separation of the genres, where photography and sculpture genres sell much more than print, painting, and other genres (note that if a work of art qualified as more than 1 genre, we included them in their independent genres along with a category called “Two+ Genres”). This gap between the photography/sculpture genres and other genres widens from 1999-2012 as well, as emphasized by the line graph below meant to complement the bar graph for this question.
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.3 ✔ tibble 3.2.1
✔ purrr 1.0.2 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks plotly::filter(), stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Get year and genre columns, clean up column names, and process two+ genres
genre <- art %>%
dplyr::select(year, starts_with("Genre")) %>%
rename_with(~ gsub("Genre___", "", .), starts_with("Genre___")) %>%
mutate(two_plus = rowSums(select(., Photography, Print, Sculpture, Painting, Others), na.rm = TRUE)) %>%
mutate("Two+ Genres" = ifelse(two_plus > 1, 1, 0)) %>%
select(-two_plus)
# Reshape to long format
genre_long <- genre %>%
pivot_longer(
cols = -year,
names_to = "genre",
values_to = "count"
)
# Summarize total sales for each genre per year
genre_summary <- genre_long %>%
group_by(year, genre) %>%
summarize(total_sales = sum(count, na.rm = TRUE)) %>%
ungroup() %>%
mutate(genre = factor(genre, levels = c("Photography", "Print", "Sculpture", "Painting", "Others", "Two+ Genres")))`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
# Interactive stacked bar chart using Plotly
plot_genre_distribution <- plot_ly(
data = genre_summary,
x = ~year,
y = ~total_sales,
color = ~genre,
type = "bar",
colors = "Set3"
) %>%
layout(
title = "Distribution of Genre of Sales Across Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Total Sales"),
barmode = "stack"
)
plot_genre_distributionThis should be a single interactive plot, with which a user can manipulate the view to be able to look at change over time overall, or by genre.
Below is the interactive plot, with buttons on the top to manipulate the genre view. The default is with all the genres, including the overall mean and median (in black).
As seen in the below graph’s mean and median line trends (as with the answers to PS4), there does seem to be a change in sales prices in USD over time. Photography art sales seem to sell for the highest prices, especially in years closest to 2012– they have the biggest increase in mean and median art sale prices and the biggest increase in sales price over time, as seen in the graphs below. This is followed by print sales, which seem to oscillate a lot more, painting, and sculpture. The latter three have a mean price of around 20000 dollars in 2012 while photography’s mean is over 40000 dollars. (Note that if a work of art included two or more genres, usually with the second genre being “Other”, we used the artwork’s primary genre.)
# Define a consistent color palette for genres and "Overall"
genre_colors <- c("Photography" = "#1f77b4", "Print" = "#ff7f0e",
"Sculpture" = "#2ca02c", "Painting" = "#d62728",
"Other" = "#9467bd", "Overall" = "black") # New color for "Overall"
# Calculate average sales price per genre per year
price_trend <- art %>%
dplyr::select(year, price_usd, starts_with("Genre")) %>%
rename_with(~ gsub("Genre___", "", .), starts_with("Genre___")) %>%
mutate(genre = case_when(
Photography == 1 ~ "Photography",
Print == 1 ~ "Print",
Sculpture == 1 ~ "Sculpture",
Painting == 1 ~ "Painting",
Others == 1 ~ "Other"
)) %>%
group_by(year, genre) %>%
summarize(
mean_price = mean(price_usd, na.rm = TRUE),
median_price = median(price_usd, na.rm = TRUE)
) %>%
ungroup() %>%
mutate(genre = factor(genre, levels = c("Photography", "Print", "Sculpture", "Painting", "Other")))`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
# Calculate overall mean and median across all genres per year
overall_trend <- art %>%
group_by(year) %>%
summarize(
overall_mean = mean(price_usd, na.rm = TRUE),
overall_median = median(price_usd, na.rm = TRUE)
)
# Initialize the plot
plot <- plot_ly()
# Add traces for each genre with consistent colors
for (g in levels(price_trend$genre)) {
plot <- plot %>%
add_trace(
data = subset(price_trend, genre == g),
x = ~year, y = ~mean_price,
type = 'scatter', mode = 'lines+markers',
name = paste(g, "- Mean"),
visible = TRUE,
line = list(dash = 'solid', color = genre_colors[[g]]),
marker = list(color = genre_colors[[g]])
) %>%
add_trace(
data = subset(price_trend, genre == g),
x = ~year, y = ~median_price,
type = 'scatter', mode = 'lines+markers',
name = paste(g, "- Median"),
visible = TRUE,
line = list(dash = 'dash', color = genre_colors[[g]]),
marker = list(color = genre_colors[[g]])
)
}
# Add overall mean and median traces with their own color
plot <- plot %>%
add_trace(
data = overall_trend,
x = ~year, y = ~overall_mean,
type = 'scatter', mode = 'lines+markers',
name = "Overall Mean",
visible = FALSE, # Set to FALSE initially; will be controlled by "Overall" button
line = list(dash = 'solid', color = genre_colors[["Overall"]]),
marker = list(color = genre_colors[["Overall"]])
) %>%
add_trace(
data = overall_trend,
x = ~year, y = ~overall_median,
type = 'scatter', mode = 'lines+markers',
name = "Overall Median",
visible = FALSE, # Set to FALSE initially; will be controlled by "Overall" button
line = list(dash = 'dash', color = genre_colors[["Overall"]]),
marker = list(color = genre_colors[["Overall"]])
)
# Define visibility settings for each genre button
num_genres <- length(levels(price_trend$genre))
visibility_settings <- lapply(1:num_genres, function(i) {
rep(FALSE, num_genres * 2 + 2) %>%
replace((i * 2 - 1):(i * 2), TRUE)
})
# Add visibility setting for the "All Genres" button to show all traces including overall
visibility_all <- rep(TRUE, num_genres * 2)
visibility_overall <- c(rep(FALSE, num_genres * 2), TRUE, TRUE)
# Define the buttons for each genre, "All Genres," and "Overall"
buttons <- list(
list(
method = "update",
args = list(list(visible = visibility_all), list(title = "Sales Price Over Time by Genre (Mean and Median)")),
label = "All Genres"
),
list(
method = "update",
args = list(list(visible = visibility_overall), list(title = "Overall Sales Price Over Time (Mean and Median)")),
label = "Overall"
)
)
# Add each genre-specific button to the buttons list
for (i in seq_along(levels(price_trend$genre))) {
genre_name <- levels(price_trend$genre)[i]
buttons <- append(buttons, list(
list(
method = "update",
args = list(list(visible = visibility_settings[[i]]), list(title = paste("Sales Price -", genre_name))),
label = genre_name
)
))
}
# Set up the layout with the updated buttons
plot <- plot %>%
layout(
title = "Sales Price Over Time by Genre (Mean and Median)",
xaxis = list(title = "Year"),
yaxis = list(title = "Sales Price (USD)"),
hovermode = "closest",
legend = list(orientation = "h", y = -0.3),
updatemenus = list(
list(
type = "buttons",
direction = "right",
x = 0.8, y = 2.15,
buttons = buttons
)
)
)
plotThese will be graded similar to last time:
Is the type of graph & choice of variables appropriate to answer the question? Is the graph clear and easy to interpret? Is the graph publication ready?
Repeat Problem 1 from PS04 using data.table.
library(data.table)
Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:dplyr':
between, first, last
The following object is masked from 'package:purrr':
transpose
library(nycflights13)Additionally,
Order both tables in descending mean delay. Both tables should use the airport names not the airport codes. Both tables should print all rows.
# Departures delay
flights <- data.table(flights)
#setDT(flights)
flights[, faa := dest]merged <- merge(flights[, faa := origin],
airports,
by = "faa",
all.x = TRUE)First table (reporting the mean and median departure delay per airport):
merged[, .(N = .N,
mean_delay = mean(dep_delay, na.rm = TRUE),
median_delay = median(dep_delay, na.rm = TRUE)),
by = name] |>
_[N >= 10, !"N"] |>
_[order(mean_delay, decreasing = TRUE)] name mean_delay median_delay
<char> <num> <num>
1: Newark Liberty Intl 15.10795 -1
2: John F Kennedy Intl 12.11216 -1
3: La Guardia 10.34688 -3
# Departures delay
flights <- data.table(flights)
#setDT(flights)
#setDT(airports)
flights[, faa := dest]
# Arrival delay
merged <- merge(flights[, faa := dest],
airports,
by = "faa",
all.x = TRUE)Second table (reporting the mean and median arrival delay per airport):
merged[, .(name = ifelse(is.na(first(name)), first(faa), first(name)),
N = .N,
mean_delay = mean(arr_delay, na.rm = TRUE),
median_delay = median(arr_delay, na.rm = TRUE)),
by = faa] |>
_[N >= 10, !c("faa", "N")] |>
_[order(mean_delay, decreasing = TRUE)] |>
print(x = _, nrows = 10000) name mean_delay median_delay
<char> <num> <num>
1: Columbia Metropolitan 41.76415094 28.0
2: Tulsa Intl 33.65986395 14.0
3: Will Rogers World 30.61904762 16.0
4: Jackson Hole Airport 28.09523810 15.0
5: Mc Ghee Tyson 24.06920415 2.0
6: Dane Co Rgnl Truax Fld 20.19604317 1.0
7: Richmond Intl 20.11125320 1.0
8: Akron Canton Regional Airport 19.69833729 3.0
9: Des Moines Intl 19.00573614 0.0
10: Gerald R Ford Intl 18.18956044 1.0
11: Birmingham Intl 16.87732342 -2.0
12: Theodore Francis Green State 16.23463687 1.0
13: Greenville-Spartanburg International 15.93544304 -0.5
14: Cincinnati Northern Kentucky Intl 15.36456376 -3.0
15: Savannah Hilton Head Intl 15.12950601 -1.0
16: Manchester Regional Airport 14.78755365 -3.0
17: Eppley Afld 14.69889841 -2.0
18: Yeager 14.67164179 -1.5
19: Kansas City Intl 14.51405836 0.0
20: Albany Intl 14.39712919 -4.0
21: General Mitchell Intl 14.16722038 0.0
22: Piedmont Triad 14.11260054 -2.0
23: Washington Dulles Intl 13.86420212 -3.0
24: Cherry Capital Airport 12.96842105 -10.0
25: James M Cox Dayton Intl 12.68048606 -3.0
26: Louisville International Airport 12.66938406 -2.0
27: Chicago Midway Intl 12.36422360 -1.0
28: Sacramento Intl 12.10992908 4.0
29: Jacksonville Intl 11.84483416 -2.0
30: Nashville Intl 11.81245891 -2.0
31: Portland Intl Jetport 11.66040210 -4.0
32: Greater Rochester Intl 11.56064461 -5.0
33: Hartsfield Jackson Atlanta Intl 11.30011285 -1.0
34: Lambert St Louis Intl 11.07846451 -3.0
35: Norfolk Intl 10.94909344 -4.0
36: Baltimore Washington Intl 10.72673385 -5.0
37: Memphis Intl 10.64531435 -2.5
38: Port Columbus Intl 10.60132291 -3.0
39: Charleston Afb Intl 10.59296847 -4.0
40: Philadelphia Intl 10.12719014 -3.0
41: Raleigh Durham Intl 10.05238095 -3.0
42: Indianapolis Intl 9.94043412 -3.0
43: Charlottesville-Albemarle 9.50000000 -5.0
44: Cleveland Hopkins Intl 9.18161129 -5.0
45: Ronald Reagan Washington Natl 9.06695204 -2.0
46: Burlington Intl 8.95099602 -4.0
47: Buffalo Niagara Intl 8.94595186 -5.0
48: Syracuse Hancock Intl 8.90392501 -5.0
49: Denver Intl 8.60650021 -2.0
50: Palm Beach Intl 8.56297210 -3.0
51: BQN 8.24549550 -1.0
52: Bob Hope 8.17567568 -3.0
53: Fort Lauderdale Hollywood Intl 8.08212154 -3.0
54: Bangor Intl 8.02793296 -9.0
55: Asheville Regional Airport 8.00383142 -1.0
56: PSE 7.87150838 0.0
57: Pittsburgh Intl 7.68099053 -5.0
58: Gallatin Field 7.60000000 -2.0
59: NW Arkansas Regional 7.46572581 -2.0
60: Tampa Intl 7.40852503 -4.0
61: Charlotte Douglas Intl 7.36031885 -3.0
62: Minneapolis St Paul Intl 7.27016886 -5.0
63: William P Hobby 7.17618819 -4.0
64: Bradley Intl 7.04854369 -10.0
65: San Antonio Intl 6.94537178 -9.0
66: South Bend Rgnl 6.50000000 -3.5
67: Louis Armstrong New Orleans Intl 6.49017497 -6.0
68: Key West Intl 6.35294118 7.0
69: Eagle Co Rgnl 6.30434783 -4.0
70: Austin Bergstrom Intl 6.01990875 -5.0
71: Chicago Ohare Intl 5.87661475 -8.0
72: Orlando Intl 5.45464309 -5.0
73: Detroit Metro Wayne Co 5.42996346 -7.0
74: Portland Intl 5.14157973 -5.0
75: Nantucket Mem 4.85227273 -3.0
76: Wilmington Intl 4.63551402 -7.0
77: Myrtle Beach Intl 4.60344828 -13.0
78: Albuquerque International Sunport 4.38188976 -5.5
79: George Bush Intercontinental 4.24079040 -5.0
80: Norman Y Mineta San Jose Intl 3.44817073 -7.0
81: Southwest Florida Intl 3.23814963 -5.0
82: San Diego Intl 3.13916574 -5.0
83: Sarasota Bradenton Intl 3.08243131 -5.0
84: Metropolitan Oakland Intl 3.07766990 -9.0
85: General Edward Lawrence Logan Intl 2.91439222 -9.0
86: San Francisco Intl 2.67289152 -8.0
87: SJU 2.52052659 -6.0
88: Yampa Valley 2.14285714 2.0
89: Phoenix Sky Harbor Intl 2.09704733 -6.0
90: Montrose Regional Airport 1.78571429 -10.5
91: Los Angeles Intl 0.54711094 -7.0
92: Dallas Fort Worth Intl 0.32212685 -9.0
93: Miami Intl 0.29905978 -9.0
94: Mc Carran Intl 0.25772849 -8.0
95: Salt Lake City Intl 0.17625459 -8.0
96: Long Beach -0.06202723 -10.0
97: Martha\\\\'s Vineyard -0.28571429 -11.0
98: Seattle Tacoma Intl -1.09909910 -11.0
99: Honolulu Intl -1.36519258 -7.0
100: STT -3.83590734 -9.0
101: John Wayne Arpt Orange Co -7.86822660 -11.0
102: Palm Springs Intl -12.72222222 -13.5
name mean_delay median_delay
Aircraft model 777-222 has the fastest average speed and made 4 flights.
planes <- data.table(planes)
#setDT(planes)merged <- merge(flights,
planes,
by = "tailnum",
all.x = TRUE)allmodels <- merged[, `:=`(num_flights = .N,
avg_mph = mean(distance/(air_time/60), na.rm = TRUE)),
by = model]
allmodels[allmodels[, .I[which.max(avg_mph)]],.(model, avg_mph, num_flights)] model avg_mph num_flights
<char> <num> <int>
1: 777-222 482.6254 4